*** How worried should we be? The implications of fabricated survey data for political science
*** Figure A12. Similarity Among Fraudulent and Real Interviews in Peru
*** Must have the percentmatch and cem packages installed

* Session setup: disable output paging so the do-file runs unattended
set more off

* Point the working directory at the folder holding the dataset (edit as needed)
cd "C:\~\Downloads\"

* Load the quality-control file and restrict to the country coded pais == 11
use "QAC_2017_12.21.17.dta", clear
keep if pais == 11

* clean_data: 1 for non-cancelled interviews, 0 for cancelled ones,
* missing when cancelled is neither 0 nor 1
gen clean_data = cond(cancelled == 0, 1, cond(cancelled == 1, 0, .))

* Interviews with ls3 coded 999999 or flagged as fraud are never counted as clean
replace clean_data = 0 if ls3 == 999999 | fraud == 1

* Interviews with ls3 coded 999999 are recoded as non-fraudulent
* (done after the clean_data update above, as in the original ordering)
replace fraud = 0 if ls3 == 999999

** Respondent age in years and age cohort
* Birth year comes from q2; the special codes below are treated as missing
gen birth_yr = q2
replace birth_yr = . if inlist(birth_yr, 888888, 988888, 999999)

* Interview date (daily date) and the calendar year it took place
gen svy_date = date(date, "MDYhm")
format svy_date %td
gen year = year(svy_date)

* Upload timestamp. Clock() returns milliseconds since 01jan1960, which does
* not fit in the default float type; it must be stored as a double or the
* value is rounded and the extracted hour can be wrong.
gen double upload_time = Clock(upload, "MDYhm")
format upload_time %tC
gen upload_hour = hhC(upload_time)

* Age at interview; missing whenever the birth year or interview year is missing
gen age_yrs = year - birth_yr

* Age cohorts. Respondents under 18 or with unknown age stay missing.
gen edad = 1 if inrange(age_yrs, 18, 25)
replace edad = 2 if inrange(age_yrs, 26, 35)
replace edad = 3 if inrange(age_yrs, 36, 45)
replace edad = 4 if inrange(age_yrs, 46, 55)
replace edad = 5 if inrange(age_yrs, 56, 65)
* Stata treats missing as larger than any number, so the open-ended top
* cohort needs an explicit guard to keep unknown ages out of "66+"
replace edad = 6 if age_yrs >= 66 & !missing(age_yrs)
label var edad "Age Cohort"
label define agegroup 1 "18-25 years old" 2 "26-35 years old" 3 "36-45 years old" 4 "46-55 years old" 5 "56-65 years old" 6 "66+ years old"
label values edad agegroup

* Education level: collapse years of education (ed) into three tiers.
* Values of ed outside 0-18 (including the special codes) leave ed_level missing.
gen ed_level = cond(inlist(ed, 0, 1, 2, 3, 4, 5, 6), 1, ///
               cond(inlist(ed, 7, 8, 9, 10, 11), 2, ///
               cond(inlist(ed, 12, 13, 14, 15, 16, 17, 18), 3, .)))
label define edlevel 1 "None or Primary Ed." 2 "Secondary Ed." 3 "University+ Ed."
label values ed_level edlevel
label var ed_level "Education Level"

* Years of education itself: set the special codes to missing
replace ed = . if inlist(ed, 888888, 988888, 999999)
label var ed "Years of Education"

** Matching fraudulent interviews to comparable real interviews
* Fix the random-number seed so the matching result is reproducible
set seed 339487731

* Indicator for interviews that were canceled without being labeled fraudulent;
* these are removed from the data before matching
gen canceled_nonfraud = (status == "Canceled" & fraud == 0)
drop if canceled_nonfraud == 1

* Coarsened exact matching on upm1a, q1 and agequota with fraud as the
* treatment. (#0) requests no coarsening for each variable and k2k requests
* a one-to-one matched sample.
cem upm1a (#0) q1 (#0) agequota (#0) if canceled_nonfraud == 0, tr(fraud) k2k

* Merge the two variants of each split item into its combined variable:
* when one variant is coded 999999, take the value from the other variant.
foreach item in dst1b env2b drk1 env1c mil10un mil10a mil10e {
    replace `item' = `item'1 if `item'2 == 999999 & pais != 22
    replace `item' = `item'2 if `item'1 == 999999 & pais != 22
}

* mil10oas follows the same pattern but excludes pais == 7 rather than 22
* NOTE(review): confirm the different country code here is intentional
replace mil10oas = mil10oas1 if mil10oas2 == 999999 & pais != 7
replace mil10oas = mil10oas2 if mil10oas1 == 999999 & pais != 7

* For pais == 22 the combined item is taken from mil10oa instead
replace mil10oas = mil10oa if pais == 22

* percentmatch (Kuriakose and Robbins 2015)
* Survey items compared across interviews
local pm_items ls3 soct2 idio2 sgl1 cp6 cp7 cp8 cp13 cp20 it1 l1 aoj11 aoj12 ///
    b1 b2 b3 b4 b6 b43 b12 b13 b18 b21 b21a b32 b37 b47a m1 m2 ///
    sd2new2 sd3new2 sd6new2 infrax infra3 ros1 ros4 ing4 eff1 eff2 ///
    media3 media4 pn4 e5 e15 d1 d2 d3 d4 d5 d6 lib1 lib2b lib2c lib4 ///
    exc7new pol1 ccq2 ccq4 q5a q5b q10e www1 gi0 e16 aoj22new ///
    ivv3 exc7 ie3 ie6 ie9 ie10 envp8

* Result goes to pmatch, with the matched interview's id in m_id;
* interviews are identified by sbjnum
percentmatch `pm_items', gen(pmatch) id(sbjnum) matchedid(m_id)

* Figure A12. percentmatch score distributions
* Both histograms are restricted to interviews kept by the cem matching step
local in_match "cem_matched == 1"

* Clean interviews are drawn as filled green bars and fraudulent interviews
* as black outlines on the same axes
twoway ///
    (hist pmatch if `in_match' & fraud == 0, freq discrete color(green)) ///
    (hist pmatch if `in_match' & fraud == 1, freq discrete fcolor(none) lcolor(black)), ///
    legend(order(1 "Clean" 2 "Fraudulent")) ///
    xtitle("percentmatch") ///
    graphregion(color(white))

